1 Introduction

To correct for batch effects, we will use the Harmony package, since it was reported to be the fastest and most accurate method in a recent benchmarking effort. To this end, we will follow the “Integration of datasets using Harmony” vignette of the SeuratWrappers package.

Importantly, we will use the harmony coordinates (batch-corrected principal components) to compute a k-nearest neighbors (KNN) graph, which will be used to compute the proportion of doublet nearest neighbors (pDNN) for each cell (see the following notebook).

2 Pre-processing

2.1 Load packages

library(Seurat)
library(SeuratWrappers)
library(harmony)
library(tidyverse)

2.2 Parameters

# Paths
path_to_obj <- here::here("scRNA-seq/results/R_objects/seurat_merged_all.rds")
path_to_save_obj <- here::here("scRNA-seq/results/R_objects/seurat_integrated_with_doublets.rds")
path_tmp_dir <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/")
path_to_save_knn <- str_c(path_tmp_dir, "integrated_knn_graph.rds", sep = "")
path_to_save_doubl_annot <- str_c(path_tmp_dir, "doublet_preliminary_annotations.rds", sep = "") 
  

# Set k for the KNN graph
k <- 75

2.3 Load data

tonsil <- readRDS(path_to_obj)
tonsil
## An object of class Seurat 
## 28504 features across 347262 samples within 1 assay 
## Active assay: RNA (28504 features, 0 variable features)

3 Normalize

To normalize by sequencing depth, we will divide each count by the library size of the cell (total number of UMI) and log-transform it, similarly to other high-quality atlases, like the thymus and the heart atlas.

tonsil <- NormalizeData(
  tonsil,
  normalization.method = "LogNormalize",
  scale.factor = 1e4
)
tonsil[["RNA"]]@data[1:10, 1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##                                              
## AL627309.1 . . . . . . . .        . .        
## AL627309.3 . . . . . . . .        . .        
## AL627309.5 . . . . . . . .        . .        
## AL627309.4 . . . . . . . .        . .        
## AP006222.2 . . . . . . . .        . .        
## AL669831.2 . . . . . . . .        . .        
## LINC01409  . . . . . . . 1.119716 . 0.2179193
## FAM87B     . . . . . . . .        . .        
## LINC01128  . . . . . . . .        . .        
## LINC00115  . . . . . . . .        . .

4 Visualize UMAP without batch effect correction

tonsil <- tonsil %>%
  FindVariableFeatures(nfeatures = 3000) %>%
  ScaleData() %>% 
  RunPCA() %>%
  RunUMAP(reduction = "pca", dims = 1:30)
p1 <- DimPlot(tonsil, group.by = "library_name", pt.size = 0.1) + NoLegend()
p2 <- DimPlot(tonsil, group.by = "sex", pt.size = 0.1)
p3 <- DimPlot(tonsil, group.by = "age_group", pt.size = 0.1)
p4 <- DimPlot(tonsil, group.by = "is_hashed", pt.size = 0.1)
print("UMAP colored by GEM:")
## [1] "UMAP colored by GEM:"
p1

print("UMAP colored by sex:")
## [1] "UMAP colored by sex:"
p2

print("UMAP colored by age:")
## [1] "UMAP colored by age:"
p3

print("UMAP colored by whether it's hashed:")
## [1] "UMAP colored by whether it's hashed:"
p4

5 Run and visualize Harmony’s integration

tonsil <- tonsil %>%
  RunHarmony(reduction = "pca", dims = 1:30, group.by.vars = "gem_id") %>%
  RunUMAP(reduction = "harmony", dims = 1:30)
p1_corr <- DimPlot(tonsil, group.by = "library_name", pt.size = 0.1) +
  NoLegend()
p2_corr <- DimPlot(tonsil, group.by = "sex", pt.size = 0.1)
p3_corr <- DimPlot(tonsil, group.by = "age_group", pt.size = 0.1)
p4_corr <- DimPlot(tonsil, group.by = "is_hashed", pt.size = 0.1)
print("UMAP colored by GEM:")
## [1] "UMAP colored by GEM:"
p1_corr

print("UMAP colored by sex:")
## [1] "UMAP colored by sex:"
p2_corr

print("UMAP colored by age:")
## [1] "UMAP colored by age:"
p3_corr

print("UMAP colored by whether it's hashed:")
## [1] "UMAP colored by whether it's hashed:"
p4_corr

6 Compute KNN graph

tonsil <- FindNeighbors(
  tonsil,
  reduction = "harmony",
  dims = 1:30,
  k.param = k,
  compute.SNN = FALSE
)
tonsil
## An object of class Seurat 
## 28504 features across 347262 samples within 1 assay 
## Active assay: RNA (28504 features, 3000 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony

7 Save

# If it doesn't exist create temporal directory
dir.create(path_tmp_dir, showWarnings = FALSE)


# Save
selected_cols <- c("HTO_classification.global", "has_high_lib_size",
                   "scrublet_predicted_doublet", "scrublet_doublet_scores",
                   "scrublet_doublet_scores_scaled")
doublet_annot_df <- tonsil@meta.data[, selected_cols]

knn_graph <- as(tonsil@graphs$RNA_nn, "sparseMatrix")
saveRDS(knn_graph, path_to_save_knn)
saveRDS(tonsil, path_to_save_obj)
#saveRDS(tonsil@graphs$RNA_nn, path_to_save_knn)
saveRDS(doublet_annot_df, path_to_save_doubl_annot)

8 Session Information

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.7 (Santiago)
## 
## Matrix products: default
## BLAS:   /apps/R/3.6.0/lib64/R/lib/libRblas.so
## LAPACK: /home/devel/rmassoni/anaconda3/lib/libmkl_rt.so
## 
## locale:
##  [1] LC_CTYPE=C                 LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.0        stringr_1.4.0        dplyr_1.0.4          purrr_0.3.4          readr_1.3.1          tidyr_1.1.0          tibble_3.0.1         ggplot2_3.3.0        tidyverse_1.3.0      harmony_1.0          Rcpp_1.0.6           SeuratWrappers_0.2.0 Seurat_3.2.0         BiocStyle_2.14.4    
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_1.4-1      deldir_0.1-25         ellipsis_0.3.1        ggridges_0.5.2        rprojroot_1.3-2       fs_1.4.1              rstudioapi_0.11       spatstat.data_1.4-3   farver_2.0.3          leiden_0.3.3          listenv_0.8.0         remotes_2.2.0         ggrepel_0.8.2         RSpectra_0.16-0       fansi_0.4.1           lubridate_1.7.8       xml2_1.3.2            codetools_0.2-16      splines_3.6.0         knitr_1.28            polyclip_1.10-0       jsonlite_1.7.2        broom_0.5.6           ica_1.0-2             cluster_2.1.0         dbplyr_1.4.4          png_0.1-7             uwot_0.1.8            shiny_1.4.0.2         sctransform_0.2.1     BiocManager_1.30.10   compiler_3.6.0        httr_1.4.2            backports_1.1.7       assertthat_0.2.1      Matrix_1.2-18         fastmap_1.0.1         lazyeval_0.2.2        cli_2.0.2             later_1.0.0           htmltools_0.4.0       tools_3.6.0           rsvd_1.0.3            igraph_1.2.5          gtable_0.3.0          glue_1.4.1            RANN_2.6.1            reshape2_1.4.4        rappdirs_0.3.1        spatstat_1.64-1       cellranger_1.1.0      vctrs_0.3.6           ape_5.3              
##  [55] nlme_3.1-148          lmtest_0.9-37         xfun_0.14             globals_0.12.5        rvest_0.3.5           mime_0.9              miniUI_0.1.1.1        lifecycle_0.2.0       irlba_2.3.3           goftest_1.2-2         future_1.17.0         MASS_7.3-51.6         zoo_1.8-8             scales_1.1.1          hms_0.5.3             promises_1.1.0        spatstat.utils_1.17-0 parallel_3.6.0        RColorBrewer_1.1-2    yaml_2.2.1            reticulate_1.16       pbapply_1.4-2         gridExtra_2.3         rpart_4.1-15          stringi_1.4.6         rlang_0.4.10          pkgconfig_2.0.3       evaluate_0.14         lattice_0.20-41       ROCR_1.0-11           tensor_1.5            labeling_0.3          patchwork_1.0.0       htmlwidgets_1.5.1     cowplot_1.0.0         tidyselect_1.1.0      here_0.1              RcppAnnoy_0.0.16      plyr_1.8.6            magrittr_1.5          bookdown_0.19         R6_2.4.1              generics_0.0.2        DBI_1.1.0             withr_2.4.1           pillar_1.4.4          haven_2.3.1           mgcv_1.8-31           fitdistrplus_1.1-1    survival_3.1-12       abind_1.4-5           future.apply_1.5.0    modelr_0.1.8          crayon_1.3.4         
## [109] KernSmooth_2.23-17    plotly_4.9.2.1        rmarkdown_2.2         readxl_1.3.1          grid_3.6.0            data.table_1.12.8     blob_1.2.1            reprex_0.3.0          digest_0.6.20         xtable_1.8-4          httpuv_1.5.3.1        munsell_0.5.0         viridisLite_0.3.0